Changes the age, city category and occupation variables from integers to factors, provides the variable types and checks for any missing values, sets seed, gets rid of scientific notation

BlackFridayDF <- read.csv("/Users/kathleengendotti/Downloads/BlackFriday.csv")
BlackFridayDF$Age <- as.factor(BlackFridayDF$Age)
BlackFridayDF$City_Category <- as.factor(BlackFridayDF$City_Category)
BlackFridayDF$Occupation <- as.factor(BlackFridayDF$Occupation)
colSums(is.na(BlackFridayDF))
##                    User_ID                 Product_ID 
##                          0                          0 
##                     Gender                        Age 
##                          0                          0 
##                 Occupation              City_Category 
##                          0                          0 
## Stay_In_Current_City_Years             Marital_Status 
##                          0                          0 
##         Product_Category_1         Product_Category_2 
##                          0                     166986 
##         Product_Category_3                   Purchase 
##                     373299                          0
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_1), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_2), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_Category_3), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$User_ID), ]
BlackFridayDF <- BlackFridayDF[!is.na(BlackFridayDF$Product_ID), ]
colSums(is.na(BlackFridayDF))
##                    User_ID                 Product_ID 
##                          0                          0 
##                     Gender                        Age 
##                          0                          0 
##                 Occupation              City_Category 
##                          0                          0 
## Stay_In_Current_City_Years             Marital_Status 
##                          0                          0 
##         Product_Category_1         Product_Category_2 
##                          0                          0 
##         Product_Category_3                   Purchase 
##                          0                          0
set.seed(1861)
options(scipen=15)

Gives mean, median, 25th and 75th quartiles, min, and max of the variables

summary(BlackFridayDF)
##     User_ID            Product_ID     Gender        Age       
##  Min.   :1000001   P00110742:  1591   F: 36932   0-17 : 4789  
##  1st Qu.:1001497   P00025442:  1586   M:127346   18-25:30889  
##  Median :1003053   P00112142:  1539              26-35:65916  
##  Mean   :1003000   P00057642:  1430              36-45:32758  
##  3rd Qu.:1004418   P00184942:  1424              46-50:13135  
##  Max.   :1006040   P00046742:  1417              51-55:11018  
##                    (Other)  :155291              55+  : 5773  
##    Occupation    City_Category Stay_In_Current_City_Years Marital_Status  
##  4      :22076   A:40848       0 :22061                   Min.   :0.0000  
##  0      :20677   B:68185       1 :57297                   1st Qu.:0.0000  
##  7      :17542   C:55245       2 :31040                   Median :0.0000  
##  17     :13844                 3 :28886                   Mean   :0.4022  
##  1      :13209                 4+:24994                   3rd Qu.:1.0000  
##  12     :10304                                            Max.   :1.0000  
##  (Other):66626                                                            
##  Product_Category_1 Product_Category_2 Product_Category_3    Purchase    
##  Min.   : 1.000     Min.   : 2.000     Min.   : 3.00      Min.   :  185  
##  1st Qu.: 1.000     1st Qu.: 2.000     1st Qu.: 9.00      1st Qu.: 7871  
##  Median : 1.000     Median : 6.000     Median :14.00      Median :11757  
##  Mean   : 2.742     Mean   : 6.896     Mean   :12.67      Mean   :11661  
##  3rd Qu.: 4.000     3rd Qu.:10.000     3rd Qu.:16.00      3rd Qu.:15627  
##  Max.   :15.000     Max.   :16.000     Max.   :18.00      Max.   :23959  
## 

Histogram of purchase

hist(BlackFridayDF$Purchase, col = 8, xlab = "Purchase", ylab = "Count", main = "Purchase vs. count")

This pie chart shows the distribution of total purchases by age group

library('plotly')
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
pieAge <- plot_ly(BlackFridayDF, labels = ~Age, values = ~Purchase, type = 'pie') %>%
  layout(title = 'Age Group Purchases',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
pieAge

Pie chart of purchases by gender

pieGender <- plot_ly(BlackFridayDF, labels = ~Gender, values = ~Purchase, type = 'pie') %>%
  layout(title = 'Gender Group Purchases',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
pieGender

These boxplots shows the distribution of purchases by gender

ggplot(aes(x=Gender, y=Purchase, fill=Gender), data = BlackFridayDF)+geom_boxplot()

Histogram plots of purchase by occupation

BlackFridayDF$Purchase <- log(BlackFridayDF$Purchase)
library("ggplot2")
ggplot(aes(Purchase), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchases by Occupation", x = "Purchase (in dollars)") + facet_wrap(~Occupation)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This plot shows the frequency of various total purchase values colored by the number of years the individual has lived in their current city.

ggplot(aes(Purchase, fill = City_Category), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchase Price Frequency by Number of Years Lived in Current City", x = "Purchase (in dollars)")+ facet_wrap(~Stay_In_Current_City_Years)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

These plots show the plots of purchases separated by the city catgeories and colored in by the number of years the individual has lived in their current city.

ggplot(aes(Purchase, fill = Stay_In_Current_City_Years), data = BlackFridayDF) + geom_histogram() + theme_light() + labs(title = "Plot of Total Purchase Price Frequency by City Catgeory", subtitle = "Colored by Number of Years Lived in Current City", x = "Purchase (in dollars)") + facet_wrap(~City_Category)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Linear Regression Model

Linear_regression <- lm(formula = Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
summary(Linear_regression)
## 
## Call:
## lm(formula = Purchase ~ Gender + Age + Occupation + City_Category + 
##     Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0875 -0.2661  0.1432  0.4259  1.0320 
## 
## Coefficients:
##                               Estimate Std. Error t value
## (Intercept)                   9.065596   0.015098 600.452
## GenderM                       0.068095   0.003668  18.567
## Age18-25                      0.028712   0.014328   2.004
## Age26-35                      0.035215   0.014343   2.455
## Age36-45                      0.043312   0.014558   2.975
## Age46-50                      0.031847   0.015266   2.086
## Age51-55                      0.072111   0.015463   4.663
## Age55+                        0.037087   0.016528   2.244
## Occupation1                  -0.017458   0.006673  -2.616
## Occupation2                  -0.010424   0.008074  -1.291
## Occupation3                   0.022003   0.009435   2.332
## Occupation4                   0.011901   0.006212   1.916
## Occupation5                   0.041413   0.010467   3.957
## Occupation6                   0.033344   0.008977   3.714
## Occupation7                   0.029981   0.006176   4.855
## Occupation8                   0.043157   0.026014   1.659
## Occupation9                  -0.014556   0.014726  -0.988
## Occupation10                 -0.031097   0.015151  -2.053
## Occupation11                  0.002420   0.011274   0.215
## Occupation12                  0.052555   0.007196   7.303
## Occupation13                 -0.005548   0.014804  -0.375
## Occupation14                  0.056893   0.007752   7.339
## Occupation15                  0.045649   0.010377   4.399
## Occupation16                  0.035008   0.008217   4.260
## Occupation17                  0.049470   0.006593   7.503
## Occupation18                  0.016311   0.014068   1.159
## Occupation19                 -0.062412   0.012899  -4.839
## Occupation20                 -0.025280   0.007516  -3.363
## City_CategoryB                0.027296   0.003751   7.278
## City_CategoryC                0.098321   0.003961  24.821
## Stay_In_Current_City_Years1   0.012955   0.004729   2.739
## Stay_In_Current_City_Years2   0.023914   0.005248   4.557
## Stay_In_Current_City_Years3   0.012292   0.005339   2.302
## Stay_In_Current_City_Years4+  0.015715   0.005510   2.852
## Marital_Status               -0.003435   0.003199  -1.074
##                                          Pr(>|t|)    
## (Intercept)                  < 0.0000000000000002 ***
## GenderM                      < 0.0000000000000002 ***
## Age18-25                                 0.045081 *  
## Age26-35                                 0.014079 *  
## Age36-45                                 0.002928 ** 
## Age46-50                                 0.036969 *  
## Age51-55                       0.0000031132431326 ***
## Age55+                                   0.024845 *  
## Occupation1                              0.008892 ** 
## Occupation2                              0.196671    
## Occupation3                              0.019693 *  
## Occupation4                              0.055405 .  
## Occupation5                    0.0000760330307448 ***
## Occupation6                              0.000204 ***
## Occupation7                    0.0000012067735603 ***
## Occupation8                              0.097112 .  
## Occupation9                              0.322943    
## Occupation10                             0.040122 *  
## Occupation11                             0.830017    
## Occupation12                   0.0000000000002820 ***
## Occupation13                             0.707857    
## Occupation14                   0.0000000000002160 ***
## Occupation15                   0.0000108812696285 ***
## Occupation16                   0.0000204208396818 ***
## Occupation17                   0.0000000000000627 ***
## Occupation18                             0.246255    
## Occupation19                   0.0000013090411186 ***
## Occupation20                             0.000770 ***
## City_CategoryB                 0.0000000000003410 ***
## City_CategoryC               < 0.0000000000000002 ***
## Stay_In_Current_City_Years1              0.006155 ** 
## Stay_In_Current_City_Years2    0.0000052037786125 ***
## Stay_In_Current_City_Years3              0.021320 *  
## Stay_In_Current_City_Years4+             0.004346 ** 
## Marital_Status                           0.282908    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5939 on 164243 degrees of freedom
## Multiple R-squared:  0.01058,    Adjusted R-squared:  0.01038 
## F-statistic: 51.68 on 34 and 164243 DF,  p-value: < 0.00000000000000022

Generalized Linear Model

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
logitFit <- glm(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
summary(logitFit)
## 
## Call:
## glm(formula = Purchase ~ Gender + Age + Occupation + City_Category + 
##     Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0875  -0.2661   0.1432   0.4259   1.0320  
## 
## Coefficients:
##                               Estimate Std. Error t value
## (Intercept)                   9.065596   0.015098 600.452
## GenderM                       0.068095   0.003668  18.567
## Age18-25                      0.028712   0.014328   2.004
## Age26-35                      0.035215   0.014343   2.455
## Age36-45                      0.043312   0.014558   2.975
## Age46-50                      0.031847   0.015266   2.086
## Age51-55                      0.072111   0.015463   4.663
## Age55+                        0.037087   0.016528   2.244
## Occupation1                  -0.017458   0.006673  -2.616
## Occupation2                  -0.010424   0.008074  -1.291
## Occupation3                   0.022003   0.009435   2.332
## Occupation4                   0.011901   0.006212   1.916
## Occupation5                   0.041413   0.010467   3.957
## Occupation6                   0.033344   0.008977   3.714
## Occupation7                   0.029981   0.006176   4.855
## Occupation8                   0.043157   0.026014   1.659
## Occupation9                  -0.014556   0.014726  -0.988
## Occupation10                 -0.031097   0.015151  -2.053
## Occupation11                  0.002420   0.011274   0.215
## Occupation12                  0.052555   0.007196   7.303
## Occupation13                 -0.005548   0.014804  -0.375
## Occupation14                  0.056893   0.007752   7.339
## Occupation15                  0.045649   0.010377   4.399
## Occupation16                  0.035008   0.008217   4.260
## Occupation17                  0.049470   0.006593   7.503
## Occupation18                  0.016311   0.014068   1.159
## Occupation19                 -0.062412   0.012899  -4.839
## Occupation20                 -0.025280   0.007516  -3.363
## City_CategoryB                0.027296   0.003751   7.278
## City_CategoryC                0.098321   0.003961  24.821
## Stay_In_Current_City_Years1   0.012955   0.004729   2.739
## Stay_In_Current_City_Years2   0.023914   0.005248   4.557
## Stay_In_Current_City_Years3   0.012292   0.005339   2.302
## Stay_In_Current_City_Years4+  0.015715   0.005510   2.852
## Marital_Status               -0.003435   0.003199  -1.074
##                                          Pr(>|t|)    
## (Intercept)                  < 0.0000000000000002 ***
## GenderM                      < 0.0000000000000002 ***
## Age18-25                                 0.045081 *  
## Age26-35                                 0.014079 *  
## Age36-45                                 0.002928 ** 
## Age46-50                                 0.036969 *  
## Age51-55                       0.0000031132431326 ***
## Age55+                                   0.024845 *  
## Occupation1                              0.008892 ** 
## Occupation2                              0.196671    
## Occupation3                              0.019693 *  
## Occupation4                              0.055405 .  
## Occupation5                    0.0000760330307448 ***
## Occupation6                              0.000204 ***
## Occupation7                    0.0000012067735603 ***
## Occupation8                              0.097112 .  
## Occupation9                              0.322943    
## Occupation10                             0.040122 *  
## Occupation11                             0.830017    
## Occupation12                   0.0000000000002820 ***
## Occupation13                             0.707857    
## Occupation14                   0.0000000000002160 ***
## Occupation15                   0.0000108812696285 ***
## Occupation16                   0.0000204208396818 ***
## Occupation17                   0.0000000000000627 ***
## Occupation18                             0.246255    
## Occupation19                   0.0000013090411186 ***
## Occupation20                             0.000770 ***
## City_CategoryB                 0.0000000000003410 ***
## City_CategoryC               < 0.0000000000000002 ***
## Stay_In_Current_City_Years1              0.006155 ** 
## Stay_In_Current_City_Years2    0.0000052037786125 ***
## Stay_In_Current_City_Years3              0.021320 *  
## Stay_In_Current_City_Years4+             0.004346 ** 
## Marital_Status                           0.282908    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.3527084)
## 
##     Null deviance: 58550  on 164277  degrees of freedom
## Residual deviance: 57930  on 164243  degrees of freedom
## AIC: 295041
## 
## Number of Fisher Scoring iterations: 2
BlackFridayDF$PurchaseHigh <- ifelse(BlackFridayDF$Purchase > median(BlackFridayDF$Purchase),1,0)

doBy::summaryBy(PurchaseHigh ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF)
tFit <- glm(PurchaseHigh ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF, family = binomial)
summary(tFit)
## 
## Call:
## glm(formula = PurchaseHigh ~ Gender + Age + Occupation + City_Category + 
##     Stay_In_Current_City_Years + Marital_Status, family = binomial, 
##     data = BlackFridayDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3826  -1.1720  -0.9489   1.1668   1.4245  
## 
## Coefficients:
##                               Estimate Std. Error z value
## (Intercept)                  -0.513669   0.051163 -10.040
## GenderM                       0.236585   0.012436  19.024
## Age18-25                      0.088867   0.048522   1.831
## Age26-35                      0.079288   0.048578   1.632
## Age36-45                      0.095505   0.049305   1.937
## Age46-50                      0.059914   0.051705   1.159
## Age51-55                      0.175756   0.052373   3.356
## Age55+                        0.045900   0.055962   0.820
## Occupation1                  -0.062909   0.022599  -2.784
## Occupation2                  -0.031010   0.027329  -1.135
## Occupation3                   0.074070   0.031897   2.322
## Occupation4                   0.063171   0.021000   3.008
## Occupation5                   0.099729   0.035372   2.819
## Occupation6                   0.115319   0.030344   3.800
## Occupation7                   0.124448   0.020887   5.958
## Occupation8                   0.115615   0.087946   1.315
## Occupation9                  -0.079018   0.050148  -1.576
## Occupation10                 -0.050735   0.051285  -0.989
## Occupation11                  0.030601   0.038076   0.804
## Occupation12                  0.190125   0.024361   7.805
## Occupation13                 -0.112523   0.050038  -2.249
## Occupation14                  0.176812   0.026232   6.740
## Occupation15                  0.130918   0.035086   3.731
## Occupation16                  0.096347   0.027770   3.469
## Occupation17                  0.173334   0.022322   7.765
## Occupation18                  0.050993   0.047540   1.073
## Occupation19                 -0.111961   0.043750  -2.559
## Occupation20                 -0.084326   0.025462  -3.312
## City_CategoryB                0.087646   0.012687   6.908
## City_CategoryC                0.315020   0.013417  23.478
## Stay_In_Current_City_Years1   0.045273   0.016007   2.828
## Stay_In_Current_City_Years2   0.089705   0.017767   5.049
## Stay_In_Current_City_Years3   0.050393   0.018076   2.788
## Stay_In_Current_City_Years4+  0.037964   0.018646   2.036
## Marital_Status               -0.009841   0.010828  -0.909
##                                          Pr(>|z|)    
## (Intercept)                  < 0.0000000000000002 ***
## GenderM                      < 0.0000000000000002 ***
## Age18-25                                 0.067026 .  
## Age26-35                                 0.102643    
## Age36-45                                 0.052744 .  
## Age46-50                                 0.246550    
## Age51-55                                 0.000791 ***
## Age55+                                   0.412103    
## Occupation1                              0.005375 ** 
## Occupation2                              0.256517    
## Occupation3                              0.020223 *  
## Occupation4                              0.002628 ** 
## Occupation5                              0.004811 ** 
## Occupation6                              0.000144 ***
## Occupation7                   0.00000000255093537 ***
## Occupation8                              0.188639    
## Occupation9                              0.115098    
## Occupation10                             0.322524    
## Occupation11                             0.421589    
## Occupation12                  0.00000000000000597 ***
## Occupation13                             0.024528 *  
## Occupation14                  0.00000000001579251 ***
## Occupation15                             0.000190 ***
## Occupation16                             0.000521 ***
## Occupation17                  0.00000000000000816 ***
## Occupation18                             0.283430    
## Occupation19                             0.010495 *  
## Occupation20                             0.000927 ***
## City_CategoryB                0.00000000000491282 ***
## City_CategoryC               < 0.0000000000000002 ***
## Stay_In_Current_City_Years1              0.004679 ** 
## Stay_In_Current_City_Years2   0.00000044386005160 ***
## Stay_In_Current_City_Years3              0.005305 ** 
## Stay_In_Current_City_Years4+             0.041752 *  
## Marital_Status                           0.363414    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 227738  on 164277  degrees of freedom
## Residual deviance: 226132  on 164243  degrees of freedom
## AIC: 226202
## 
## Number of Fisher Scoring iterations: 4

Lasso vs. Ridge Models

myFormula <- as.formula(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status)

library("useful")
Xvar <- build.x(myFormula, data=BlackFridayDF)
Yvar <- build.y(myFormula, data=BlackFridayDF)
library("glmnet")
LassoMod <- cv.glmnet(x = Xvar, y = Yvar, alpha = 1, nfolds=10)
RidgeMod <- cv.glmnet(x = Xvar, y = Yvar, alpha = 0, nfolds=10)

Lassomin = coef(LassoMod, s = "lambda.min")
Lasso1se = coef(LassoMod, s = "lambda.1se")
Ridgemin = coef(RidgeMod, s = "lambda.min")
Ridge1se = coef(RidgeMod, s = "lambda.1se")

Lassomin = round(Lassomin, digits = 4)
Ridgemin = round(Ridgemin, digits = 4)
Lasso1se = round(Lasso1se, digits = 4)
Ridge1se = round(Ridge1se, digits = 4)

table1 = cbind(Lassomin,Ridgemin)
colnames(table1) = c("Lasso", "Ridge")
print(table1)
## 36 x 2 sparse Matrix of class "dgCMatrix"
##                                Lasso   Ridge
## (Intercept)                   9.0728  9.0857
## (Intercept)                   .       .     
## GenderM                       0.0682  0.0678
## Age18-25                      0.0214  0.0106
## Age26-35                      0.0278  0.0167
## Age36-45                      0.0359  0.0249
## Age46-50                      0.0244  0.0134
## Age51-55                      0.0647  0.0534
## Age55+                        0.0297  0.0187
## Occupation1                  -0.0173 -0.0179
## Occupation2                  -0.0103 -0.0109
## Occupation3                   0.0222  0.0214
## Occupation4                   0.0120  0.0113
## Occupation5                   0.0416  0.0409
## Occupation6                   0.0335  0.0329
## Occupation7                   0.0301  0.0295
## Occupation8                   0.0430  0.0421
## Occupation9                  -0.0143 -0.0149
## Occupation10                 -0.0370 -0.0462
## Occupation11                  0.0026  0.0022
## Occupation12                  0.0527  0.0519
## Occupation13                 -0.0053 -0.0055
## Occupation14                  0.0570  0.0561
## Occupation15                  0.0458  0.0451
## Occupation16                  0.0352  0.0345
## Occupation17                  0.0496  0.0490
## Occupation18                  0.0164  0.0160
## Occupation19                 -0.0629 -0.0639
## Occupation20                 -0.0251 -0.0256
## City_CategoryB                0.0272  0.0263
## City_CategoryC                0.0983  0.0969
## Stay_In_Current_City_Years1   0.0130  0.0125
## Stay_In_Current_City_Years2   0.0239  0.0234
## Stay_In_Current_City_Years3   0.0123  0.0118
## Stay_In_Current_City_Years4+  0.0157  0.0153
## Marital_Status               -0.0033 -0.0030
table2 = cbind(Lasso1se, Ridge1se)
colnames(table2) = c("Lasso", "Ridge")
print(table2)
## 36 x 2 sparse Matrix of class "dgCMatrix"
##                               Lasso   Ridge
## (Intercept)                  9.1970  9.2128
## (Intercept)                  .       .     
## GenderM                      0.0240  0.0128
## Age18-25                     .      -0.0022
## Age26-35                     .      -0.0008
## Age36-45                     .       0.0023
## Age46-50                     .      -0.0001
## Age51-55                     .       0.0072
## Age55+                       .       0.0022
## Occupation1                  .      -0.0070
## Occupation2                  .      -0.0065
## Occupation3                  .      -0.0011
## Occupation4                  .      -0.0022
## Occupation5                  .       0.0052
## Occupation6                  .       0.0025
## Occupation7                  .       0.0043
## Occupation8                  .       0.0061
## Occupation9                  .      -0.0106
## Occupation10                 .      -0.0116
## Occupation11                 .      -0.0004
## Occupation12                 .       0.0083
## Occupation13                 .       0.0021
## Occupation14                 .       0.0073
## Occupation15                 .       0.0061
## Occupation16                 .       0.0050
## Occupation17                 .       0.0094
## Occupation18                 .       0.0035
## Occupation19                 .      -0.0138
## Occupation20                 .      -0.0089
## City_CategoryB               .      -0.0035
## City_CategoryC               0.0355  0.0136
## Stay_In_Current_City_Years1  .       0.0000
## Stay_In_Current_City_Years2  .       0.0023
## Stay_In_Current_City_Years3  .      -0.0003
## Stay_In_Current_City_Years4+ .       0.0010
## Marital_Status               .       0.0009
r2Lassomin <- LassoMod$glmnet.fit$dev.ratio[which(LassoMod$glmnet.fit$lambda == LassoMod$lambda.min)]
r2Lasso1se <- LassoMod$glmnet.fit$dev.ratio[which(LassoMod$glmnet.fit$lambda == LassoMod$lambda.1se)]
r2Lassomin
## [1] 0.0105828
r2Lasso1se
## [1] 0.004431786
r2Ridgemin <- RidgeMod$glmnet.fit$dev.ratio[which(RidgeMod$glmnet.fit$lambda == RidgeMod$lambda.min)]
r2Ridge1se <- RidgeMod$glmnet.fit$dev.ratio[which(RidgeMod$glmnet.fit$lambda == RidgeMod$lambda.1se)]
r2Ridgemin
## [1] 0.01057293
r2Ridge1se
## [1] 0.00356048

Forward Stepwise Model

library("leaps")
#was not sure on good value to set for nvmax so went with the default of 8
BFFitFwd <- regsubsets(Purchase ~ Gender + Age + Occupation + City_Category + Stay_In_Current_City_Years + Marital_Status , data = BlackFridayDF, nvmax = 8, method = "forward")
summary(BFFitFwd)
## Subset selection object
## Call: regsubsets.formula(Purchase ~ Gender + Age + Occupation + City_Category + 
##     Stay_In_Current_City_Years + Marital_Status, data = BlackFridayDF, 
##     nvmax = 8, method = "forward")
## 34 Variables  (and intercept)
##                              Forced in Forced out
## GenderM                          FALSE      FALSE
## Age18-25                         FALSE      FALSE
## Age26-35                         FALSE      FALSE
## Age36-45                         FALSE      FALSE
## Age46-50                         FALSE      FALSE
## Age51-55                         FALSE      FALSE
## Age55+                           FALSE      FALSE
## Occupation1                      FALSE      FALSE
## Occupation2                      FALSE      FALSE
## Occupation3                      FALSE      FALSE
## Occupation4                      FALSE      FALSE
## Occupation5                      FALSE      FALSE
## Occupation6                      FALSE      FALSE
## Occupation7                      FALSE      FALSE
## Occupation8                      FALSE      FALSE
## Occupation9                      FALSE      FALSE
## Occupation10                     FALSE      FALSE
## Occupation11                     FALSE      FALSE
## Occupation12                     FALSE      FALSE
## Occupation13                     FALSE      FALSE
## Occupation14                     FALSE      FALSE
## Occupation15                     FALSE      FALSE
## Occupation16                     FALSE      FALSE
## Occupation17                     FALSE      FALSE
## Occupation18                     FALSE      FALSE
## Occupation19                     FALSE      FALSE
## Occupation20                     FALSE      FALSE
## City_CategoryB                   FALSE      FALSE
## City_CategoryC                   FALSE      FALSE
## Stay_In_Current_City_Years1      FALSE      FALSE
## Stay_In_Current_City_Years2      FALSE      FALSE
## Stay_In_Current_City_Years3      FALSE      FALSE
## Stay_In_Current_City_Years4+     FALSE      FALSE
## Marital_Status                   FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          GenderM Age18-25 Age26-35 Age36-45 Age46-50 Age51-55 Age55+
## 1  ( 1 ) " "     " "      " "      " "      " "      " "      " "   
## 2  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 3  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 4  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 5  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 6  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 7  ( 1 ) "*"     " "      " "      " "      " "      " "      " "   
## 8  ( 1 ) "*"     " "      " "      " "      " "      "*"      " "   
##          Occupation1 Occupation2 Occupation3 Occupation4 Occupation5
## 1  ( 1 ) " "         " "         " "         " "         " "        
## 2  ( 1 ) " "         " "         " "         " "         " "        
## 3  ( 1 ) " "         " "         " "         " "         " "        
## 4  ( 1 ) " "         " "         " "         " "         " "        
## 5  ( 1 ) " "         " "         " "         " "         " "        
## 6  ( 1 ) " "         " "         " "         " "         " "        
## 7  ( 1 ) "*"         " "         " "         " "         " "        
## 8  ( 1 ) "*"         " "         " "         " "         " "        
##          Occupation6 Occupation7 Occupation8 Occupation9 Occupation10
## 1  ( 1 ) " "         " "         " "         " "         " "         
## 2  ( 1 ) " "         " "         " "         " "         " "         
## 3  ( 1 ) " "         " "         " "         " "         "*"         
## 4  ( 1 ) " "         " "         " "         " "         "*"         
## 5  ( 1 ) " "         " "         " "         " "         "*"         
## 6  ( 1 ) " "         " "         " "         " "         "*"         
## 7  ( 1 ) " "         " "         " "         " "         "*"         
## 8  ( 1 ) " "         " "         " "         " "         "*"         
##          Occupation11 Occupation12 Occupation13 Occupation14 Occupation15
## 1  ( 1 ) " "          " "          " "          " "          " "         
## 2  ( 1 ) " "          " "          " "          " "          " "         
## 3  ( 1 ) " "          " "          " "          " "          " "         
## 4  ( 1 ) " "          " "          " "          " "          " "         
## 5  ( 1 ) " "          " "          " "          " "          " "         
## 6  ( 1 ) " "          " "          " "          " "          " "         
## 7  ( 1 ) " "          " "          " "          " "          " "         
## 8  ( 1 ) " "          " "          " "          " "          " "         
##          Occupation16 Occupation17 Occupation18 Occupation19 Occupation20
## 1  ( 1 ) " "          " "          " "          " "          " "         
## 2  ( 1 ) " "          " "          " "          " "          " "         
## 3  ( 1 ) " "          " "          " "          " "          " "         
## 4  ( 1 ) " "          " "          " "          " "          " "         
## 5  ( 1 ) " "          " "          " "          "*"          " "         
## 6  ( 1 ) " "          " "          " "          "*"          "*"         
## 7  ( 1 ) " "          " "          " "          "*"          "*"         
## 8  ( 1 ) " "          " "          " "          "*"          "*"         
##          City_CategoryB City_CategoryC Stay_In_Current_City_Years1
## 1  ( 1 ) " "            "*"            " "                        
## 2  ( 1 ) " "            "*"            " "                        
## 3  ( 1 ) " "            "*"            " "                        
## 4  ( 1 ) "*"            "*"            " "                        
## 5  ( 1 ) "*"            "*"            " "                        
## 6  ( 1 ) "*"            "*"            " "                        
## 7  ( 1 ) "*"            "*"            " "                        
## 8  ( 1 ) "*"            "*"            " "                        
##          Stay_In_Current_City_Years2 Stay_In_Current_City_Years3
## 1  ( 1 ) " "                         " "                        
## 2  ( 1 ) " "                         " "                        
## 3  ( 1 ) " "                         " "                        
## 4  ( 1 ) " "                         " "                        
## 5  ( 1 ) " "                         " "                        
## 6  ( 1 ) " "                         " "                        
## 7  ( 1 ) " "                         " "                        
## 8  ( 1 ) " "                         " "                        
##          Stay_In_Current_City_Years4+ Marital_Status
## 1  ( 1 ) " "                          " "           
## 2  ( 1 ) " "                          " "           
## 3  ( 1 ) " "                          " "           
## 4  ( 1 ) " "                          " "           
## 5  ( 1 ) " "                          " "           
## 6  ( 1 ) " "                          " "           
## 7  ( 1 ) " "                          " "           
## 8  ( 1 ) " "                          " "
#created separate variables for city catrgories B and C since the forward stepwise model chose them
BlackFridayDF$City_CategoryB <- ifelse(BlackFridayDF$City_Category == "B",1,0)
BlackFridayDF$City_CategoryC <- ifelse(BlackFridayDF$City_Category == "C",1,0)
#created separate variable for the age group of people between 51 and 55
BlackFridayDF$Age51 <- ifelse(BlackFridayDF$Age == "51-55",1,0)
#created separate variables for the occupations the model chose
BlackFridayDF$Occupation1 <- ifelse(BlackFridayDF$Occupation == "1",1,0)
BlackFridayDF$Occupation10 <- ifelse(BlackFridayDF$Occupation == "10",1,0)
BlackFridayDF$Occupation19 <- ifelse(BlackFridayDF$Occupation == "19",1,0)
BlackFridayDF$Occupation20 <- ifelse(BlackFridayDF$Occupation == "20",1,0)

lmDayDF <- lm(Purchase ~ Gender + City_CategoryB + City_CategoryC + Age51 + Occupation1 + Occupation10 + Occupation19 + Occupation20, data = BlackFridayDF)
summary(lmDayDF)
## 
## Call:
## lm(formula = Purchase ~ Gender + City_CategoryB + City_CategoryC + 
##     Age51 + Occupation1 + Occupation10 + Occupation19 + Occupation20, 
##     data = BlackFridayDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0752 -0.2652  0.1459  0.4273  1.0323 
## 
## Coefficients:
##                 Estimate Std. Error  t value             Pr(>|t|)    
## (Intercept)     9.131105   0.004144 2203.393 < 0.0000000000000002 ***
## GenderM         0.074262   0.003529   21.044 < 0.0000000000000002 ***
## City_CategoryB  0.028850   0.003726    7.744  0.00000000000000971 ***
## City_CategoryC  0.100927   0.003900   25.878 < 0.0000000000000002 ***
## Age51           0.036187   0.005883    6.151  0.00000000077334193 ***
## Occupation1    -0.040556   0.005442   -7.452  0.00000000000009254 ***
## Occupation10   -0.083945   0.009270   -9.055 < 0.0000000000000002 ***
## Occupation19   -0.091420   0.012247   -7.464  0.00000000000008412 ***
## Occupation20   -0.048856   0.006463   -7.560  0.00000000000004057 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5942 on 164269 degrees of freedom
## Multiple R-squared:  0.009239,   Adjusted R-squared:  0.00919 
## F-statistic: 191.5 on 8 and 164269 DF,  p-value: < 0.00000000000000022

Calculates RMSE and MSE over all the models

RMSE = function(x1,x2) {
  sqrt(mean(x1-x2)^2)
}

lmRMSE = RMSE(predict(Linear_regression, newx = Xvar), BlackFridayDF$Purchase)
ltRMSE = RMSE(predict(logitFit, newx = Xvar), BlackFridayDF$Purchase)
LassoRMSE = RMSE(predict(LassoMod, newx = Xvar), BlackFridayDF$Purchase)
RidgeRMSE = RMSE(predict(RidgeMod, newx = Xvar), BlackFridayDF$Purchase)
lmDayDFRMSE = RMSE(predict(lmDayDF, newx = Xvar), BlackFridayDF$Purchase)

MSE <- function(yhat, ytrue){ 
  mean((yhat - ytrue)^2)
}

lmMSE = MSE(predict(Linear_regression, newx = Xvar), BlackFridayDF$Purchase)
ltMSE = MSE(predict(logitFit, newx = Xvar), BlackFridayDF$Purchase)
LassoMSE = MSE(predict(LassoMod, newx = Xvar), BlackFridayDF$Purchase)
RidgeMSE = MSE(predict(RidgeMod, newx = Xvar), BlackFridayDF$Purchase)
lmDayDFMSE = RMSE(predict(lmDayDF, newx = Xvar), BlackFridayDF$Purchase)

compareModels <- matrix(c(lmRMSE,ltRMSE, LassoRMSE,RidgeRMSE, lmDayDFRMSE, lmMSE, ltMSE, LassoMSE,RidgeMSE, lmDayDFMSE),ncol=2,byrow=TRUE)
colnames(compareModels) <- c("RMSE","MSE")
rownames(compareModels) <- c("Linear Regression", "Generalized Linear","Lasso","Ridge", "Forward Stepwise")
compareModels <- as.table(compareModels)
compareModels
##                                       RMSE                     MSE
## Linear Regression  0.000000000000004791599 0.000000000000004791599
## Generalized Linear 0.000000000000062269982 0.000000000000066568344
## Lasso              0.000000000000003284116 0.352633274631699078494
## Ridge              0.352633274631699078494 0.354826116309063299692
## Forward Stepwise   0.355136654667222717574 0.000000000000003284116